# import library
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.3
library(mice)
## Warning: package 'mice' was built under R version 4.4.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(forcats)
set.seed(5003)
# Read data
house_market = read.csv('Melbourne_housing_FULL.csv')
glimpse(house_market)
## Rows: 34,857
## Columns: 21
## $ Suburb <chr> "Abbotsford", "Abbotsford", "Abbotsford", "Abbotsford", …
## $ Address <chr> "68 Studley St", "85 Turner St", "25 Bloomburg St", "18/…
## $ Rooms <int> 2, 2, 2, 3, 3, 3, 4, 4, 2, 2, 2, 3, 2, 4, 2, 3, 2, 1, 2,…
## $ Type <chr> "h", "h", "h", "u", "h", "h", "h", "h", "h", "h", "h", "…
## $ Price <int> NA, 1480000, 1035000, NA, 1465000, 850000, 1600000, NA, …
## $ Method <chr> "SS", "S", "S", "VB", "SP", "PI", "VB", "SN", "S", "S", …
## $ SellerG <chr> "Jellis", "Biggin", "Biggin", "Rounds", "Biggin", "Biggi…
## $ Date <chr> "3/09/2016", "3/12/2016", "4/02/2016", "4/02/2016", "4/0…
## $ Distance <chr> "2.5", "2.5", "2.5", "2.5", "2.5", "2.5", "2.5", "2.5", …
## $ Postcode <chr> "3067", "3067", "3067", "3067", "3067", "3067", "3067", …
## $ Bedroom2 <int> 2, 2, 2, 3, 3, 3, 3, 3, 4, 3, 2, 4, 2, 6, 2, NA, NA, 1, …
## $ Bathroom <int> 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, NA, NA, 1, …
## $ Car <int> 1, 1, 0, 1, 0, 1, 2, 2, 2, 1, 0, 0, 1, 0, 2, NA, NA, 1, …
## $ Landsize <int> 126, 202, 156, 0, 134, 94, 120, 400, 201, 202, 181, 245,…
## $ BuildingArea <dbl> NA, NA, 79, NA, 150, NA, 142, 220, NA, NA, NA, 210, 82, …
## $ YearBuilt <int> NA, NA, 1900, NA, 1900, NA, 2014, 2006, 1900, 1900, NA, …
## $ CouncilArea <chr> "Yarra City Council", "Yarra City Council", "Yarra City …
## $ Lattitude <dbl> -37.8014, -37.7996, -37.8079, -37.8114, -37.8093, -37.79…
## $ Longtitude <dbl> 144.9958, 144.9984, 144.9934, 145.0116, 144.9944, 144.99…
## $ Regionname <chr> "Northern Metropolitan", "Northern Metropolitan", "North…
## $ Propertycount <chr> "4019", "4019", "4019", "4019", "4019", "4019", "4019", …
# check unique value
sapply(house_market, n_distinct)
## Suburb Address Rooms Type Price
## 351 34009 12 3 2872
## Method SellerG Date Distance Postcode
## 9 388 78 216 212
## Bedroom2 Bathroom Car Landsize BuildingArea
## 16 12 16 1685 741
## YearBuilt CouncilArea Lattitude Longtitude Regionname
## 161 34 13403 14525 9
## Propertycount
## 343
# check the null value
colSums(is.na(house_market))
## Suburb Address Rooms Type Price
## 0 0 0 0 7610
## Method SellerG Date Distance Postcode
## 0 0 0 0 0
## Bedroom2 Bathroom Car Landsize BuildingArea
## 8217 8226 8728 11810 21115
## YearBuilt CouncilArea Lattitude Longtitude Regionname
## 19306 0 7976 7976 0
## Propertycount
## 0
# remove Unnecessary columns
house_market = house_market %>% select(-Method, -SellerG, -Bedroom2, -Postcode, -Regionname, -Propertycount)
impute the Bathroom, Car,
YearBuilt, Landsize with median in different
suburbs, because median will be impacted less by extrem value and the
house is similar in same suburb.
# impute the bedroom, bathroom, car, YearBuilt, Landsize
house_market$Landsize[house_market$Landsize == 0] = NA # set 0 as NA
house_market = house_market %>%
group_by(Suburb) %>%
mutate(
Bathroom = ifelse(is.na(Bathroom), median(Bathroom, na.rm = TRUE), Bathroom),
Car = ifelse(is.na(Car), median(Car, na.rm = TRUE), Car),
Landsize = ifelse(is.na(Landsize), median(Landsize, na.rm = TRUE), Landsize),
YearBuilt = ifelse(is.na(YearBuilt), median(YearBuilt, na.rm = TRUE), YearBuilt)
) %>%
ungroup()
Using multiple imputation to impute the BuildingArea
because this columns has large percentage Na value we need to use other
column to predict
# MICE to impute the BuildingArea
impute_data = house_market[, c("Landsize", "BuildingArea")]
pred = make.predictorMatrix(impute_data)
pred[,] = 0
pred["BuildingArea", "Landsize"] = 1
imp = mice(impute_data, m = 5, method = "pmm", predictorMatrix = pred, maxit = 10, seed = 5003)
##
## iter imp variable
## 1 1 Landsize BuildingArea
## 1 2 Landsize BuildingArea
## 1 3 Landsize BuildingArea
## 1 4 Landsize BuildingArea
## 1 5 Landsize BuildingArea
## 2 1 Landsize BuildingArea
## 2 2 Landsize BuildingArea
## 2 3 Landsize BuildingArea
## 2 4 Landsize BuildingArea
## 2 5 Landsize BuildingArea
## 3 1 Landsize BuildingArea
## 3 2 Landsize BuildingArea
## 3 3 Landsize BuildingArea
## 3 4 Landsize BuildingArea
## 3 5 Landsize BuildingArea
## 4 1 Landsize BuildingArea
## 4 2 Landsize BuildingArea
## 4 3 Landsize BuildingArea
## 4 4 Landsize BuildingArea
## 4 5 Landsize BuildingArea
## 5 1 Landsize BuildingArea
## 5 2 Landsize BuildingArea
## 5 3 Landsize BuildingArea
## 5 4 Landsize BuildingArea
## 5 5 Landsize BuildingArea
## 6 1 Landsize BuildingArea
## 6 2 Landsize BuildingArea
## 6 3 Landsize BuildingArea
## 6 4 Landsize BuildingArea
## 6 5 Landsize BuildingArea
## 7 1 Landsize BuildingArea
## 7 2 Landsize BuildingArea
## 7 3 Landsize BuildingArea
## 7 4 Landsize BuildingArea
## 7 5 Landsize BuildingArea
## 8 1 Landsize BuildingArea
## 8 2 Landsize BuildingArea
## 8 3 Landsize BuildingArea
## 8 4 Landsize BuildingArea
## 8 5 Landsize BuildingArea
## 9 1 Landsize BuildingArea
## 9 2 Landsize BuildingArea
## 9 3 Landsize BuildingArea
## 9 4 Landsize BuildingArea
## 9 5 Landsize BuildingArea
## 10 1 Landsize BuildingArea
## 10 2 Landsize BuildingArea
## 10 3 Landsize BuildingArea
## 10 4 Landsize BuildingArea
## 10 5 Landsize BuildingArea
completed_data = complete(imp, 1)
house_market$BuildingArea = completed_data$BuildingArea
Remove the NA in Price because this column is target
value removing can avoid leaking. Remove Na value in
Lattitude and Longtitude these value is hard
to impute
house_market = house_market %>%
filter(
!is.na(Lattitude),
!is.na(Longtitude),
!is.na(Price)
)
# check the null value
colSums(is.na(house_market))
## Suburb Address Rooms Type Price Date
## 0 0 0 0 0 0
## Distance Bathroom Car Landsize BuildingArea YearBuilt
## 0 0 0 5 0 17
## CouncilArea Lattitude Longtitude
## 0 0 0
# remove the Na value which can not be imputed
house_market = house_market %>%
filter(
!is.na(YearBuilt),
!is.na(Landsize)
)
# Add geospatial analysis libraries
library(sf)
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.4.3
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Loading required package: viridisLite
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
# Load shapefile data
shapefile_path <- "shapefile/SAL_2021_AUST_GDA94.shp"
all_suburbs <- st_read(shapefile_path, quiet = TRUE)
# Filter Victoria state suburbs
melbourne_suburbs <- all_suburbs[all_suburbs$STE_NAME21 == "Victoria", ]
print(paste("Victoria SAL contains", nrow(melbourne_suburbs), "suburbs"))
## [1] "Victoria SAL contains 2946 suburbs"
# Prepare individual house data for plotting
house_data <- house_market %>%
filter(!is.na(Lattitude), !is.na(Longtitude), !is.na(Price)) %>%
select(Address, Suburb, Price, Lattitude, Longtitude, Rooms, Type, Bathroom, Car, Landsize, BuildingArea, YearBuilt)
print(paste("Total individual houses for plotting:", nrow(house_data)))
## [1] "Total individual houses for plotting: 20974"
print(paste("Price range: $", format(min(house_data$Price), big.mark = ","), "to $", format(max(house_data$Price), big.mark = ",")))
## [1] "Price range: $ 85,000 to $ 11,200,000"
# Debug: Check house_data
print(paste("House data rows:", nrow(house_data)))
## [1] "House data rows: 20974"
print(paste("Price range:", min(house_data$Price, na.rm = TRUE), "to", max(house_data$Price, na.rm = TRUE)))
## [1] "Price range: 85000 to 11200000"
print("First few rows:")
## [1] "First few rows:"
head(house_data, 3)
## # A tibble: 3 × 12
## Address Suburb Price Lattitude Longtitude Rooms Type Bathroom Car Landsize
## <chr> <chr> <int> <dbl> <dbl> <int> <chr> <dbl> <dbl> <dbl>
## 1 85 Tur… Abbot… 1.48e6 -37.8 145. 2 h 1 1 202
## 2 25 Blo… Abbot… 1.03e6 -37.8 145. 2 h 1 0 156
## 3 5 Char… Abbot… 1.46e6 -37.8 145. 3 h 2 0 134
## # ℹ 2 more variables: BuildingArea <dbl>, YearBuilt <dbl>
# Create individual house price scatter plot (optimized for display)
# Sample data for better performance in preview
set.seed(5003)
sample_data <- house_data[sample(nrow(house_data), min(5000, nrow(house_data))), ]
p1 <- ggplot(sample_data, aes(x = Longtitude, y = Lattitude, color = Price)) +
geom_point(alpha = 0.7, size = 1) +
scale_color_viridis_c(
name = "Price\n(AUD)",
labels = dollar_format(scale = 1e-3, suffix = "K"),
option = "plasma",
trans = "log10"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 16, hjust = 0.5),
legend.position = "right",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10)
) +
labs(
title = "Melbourne Individual House Price Distribution",
subtitle = paste("Data Source: Melbourne Housing Dataset | Sample of", nrow(sample_data), "houses (from", nrow(house_data), "total)"),
x = "Longitude",
y = "Latitude",
caption = "Note: Each point represents one house, darker colors indicate higher prices"
)
# Display the plot
p1
# Calculate average housing prices for each suburb for boundary map
suburb_prices <- house_market %>%
group_by(Suburb) %>%
summarise(
avg_price = mean(Price, na.rm = TRUE),
median_price = median(Price, na.rm = TRUE),
count = n(),
.groups = 'drop'
) %>%
arrange(desc(avg_price))
# Merge suburb data with shapefile
melbourne_suburbs$SAL_NAME_clean <- toupper(trimws(melbourne_suburbs$SAL_NAME21))
suburb_prices$Suburb_clean <- toupper(trimws(suburb_prices$Suburb))
merged_data <- melbourne_suburbs %>%
left_join(suburb_prices, by = c("SAL_NAME_clean" = "Suburb_clean"))
heatmap_data <- merged_data %>%
filter(!is.na(avg_price))
print(paste("Suburbs with price data for boundary map:", nrow(heatmap_data)))
## [1] "Suburbs with price data for boundary map: 249"
# Create interactive suburb boundary map
heatmap_data$popup_content <- paste0(
"<strong>", heatmap_data$SAL_NAME21, "</strong><br/>",
"Average Price: $", format(heatmap_data$avg_price, big.mark = ",", scientific = FALSE), "<br/>",
"Median Price: $", format(heatmap_data$median_price, big.mark = ",", scientific = FALSE), "<br/>",
"Number of Houses: ", heatmap_data$count
)
pal <- colorNumeric(
palette = "plasma",
domain = heatmap_data$avg_price,
na.color = "transparent"
)
interactive_map <- leaflet(heatmap_data) %>%
addTiles() %>%
addPolygons(
fillColor = ~pal(avg_price),
fillOpacity = 0.7,
color = "white",
weight = 1,
popup = ~popup_content,
highlight = highlightOptions(
weight = 3,
color = "red",
fillOpacity = 0.9,
bringToFront = TRUE
)
) %>%
addLegend(
pal = pal,
values = ~avg_price,
title = "Average Price (AUD)",
position = "bottomright",
labFormat = labelFormat(prefix = "$", big.mark = ",")
)
## Warning: sf layer has inconsistent datum (+proj=longlat +ellps=GRS80 +no_defs).
## Need '+proj=longlat +datum=WGS84'
interactive_map
# Most expensive and cheapest individual houses
top_expensive <- house_data %>%
arrange(desc(Price)) %>%
head(10) %>%
select(Address, Suburb, Price, Rooms, Type, Bathroom, Car, Landsize, BuildingArea, YearBuilt)
top_cheap <- house_data %>%
arrange(Price) %>%
head(10) %>%
select(Address, Suburb, Price, Rooms, Type, Bathroom, Car, Landsize, BuildingArea, YearBuilt)
print("Top 10 Most Expensive Houses:")
## [1] "Top 10 Most Expensive Houses:"
print(top_expensive)
## # A tibble: 10 × 10
## Address Suburb Price Rooms Type Bathroom Car Landsize BuildingArea
## <chr> <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6 Cole St Brigh… 1.12e7 4 h 3 2 1400 296
## 2 35 Bevis St Mulgr… 9 e6 3 h 1 1 744 117
## 3 49 Mangarra Rd Cante… 8 e6 5 h 5 4 2079 464.
## 4 49 Lisson Gr Hawth… 7.65e6 4 h 2 4 1690 284
## 5 42 Hampden Rd Armad… 6.80e6 4 h 2 2 335 98
## 6 6 Moorakyne Av Malve… 6.60e6 5 h 3 2 396 465
## 7 15 Barry St Kew 6.5 e6 6 h 6 3 1334 365
## 8 136 Page St Middl… 6.40e6 5 h 2 1 553 308
## 9 307 Beaconsfi… Middl… 6.37e6 4 h 3 3 476 279
## 10 13 Wilks Av Malve… 6.30e6 5 h 4 4 900 435
## # ℹ 1 more variable: YearBuilt <dbl>
print("Top 10 Cheapest Houses:")
## [1] "Top 10 Cheapest Houses:"
print(top_cheap)
## # A tibble: 10 × 10
## Address Suburb Price Rooms Type Bathroom Car Landsize BuildingArea
## <chr> <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 202/51 Gordon… Foots… 85000 1 u 1 0 266. 265
## 2 30 Pyne St Caulf… 131000 4 h 1 2 499 155
## 3 171 Moreland … Coburg 145000 4 h 1 1 536 164
## 4 8/6 Ridley St Albion 145000 1 u 1 1 36 106
## 5 17/17 Park St Hawth… 160000 1 u 1 0 322 170
## 6 10/30 Pickett… Foots… 170000 1 u 1 0 30 26
## 7 4/831 Park St Bruns… 170000 1 u 1 0 1250 130
## 8 10/117 Anders… Albion 185000 1 u 1 1 2347 43
## 9 8/132 Rupert … West … 185000 1 u 1 1 379 114
## 10 5/25 Ridley St Albion 190000 2 u 1 1 591 169
## # ℹ 1 more variable: YearBuilt <dbl>